Setup

# Import functions
# Project root; all data and results paths below are built relative to this
root <- "./"  
source(file.path(root,"general_functions.R"))
# Load the knit-time `params` list (subsetGenes, subsetCells, resolution,
# resultsPath, nCores, perplexity) into the session.
# NOTE(review): import_parameters() is defined in general_functions.R --
# presumably it promotes list entries to variables; confirm there.
import_parameters(params)
## **** __Utilized Cores__ **** = 64
## $subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] "F"
## 
## $resolution
## [1] "0.5"
## 
## $resultsPath
## [1] "Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.5__perplexity-30__nCores-64"
## 
## $nCores
## [1] 64
## 
## $perplexity
## [1] "30"

__Results/subsetGenes-protein_coding__subsetCells-F__Resolution-0.5__perplexity-30__nCores-64

Load Libraries

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(shiny) 
library(DT) 

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS release 6.2 (Final)
## 
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-common/intel/parallel_studio_xe_2018/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.4        shiny_1.1.0   plotly_4.7.1  knitr_1.20    gridExtra_2.3
##  [6] dplyr_0.7.6   Seurat_2.3.4  Matrix_1.2-14 cowplot_0.9.3 ggplot2_3.0.0
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.13          colorspace_1.3-2    class_7.3-14       
##   [4] modeltools_0.2-22   ggridges_0.5.0      mclust_5.4.1       
##   [7] rprojroot_1.3-2     htmlTable_1.12      base64enc_0.1-3    
##  [10] rstudioapi_0.7      proxy_0.4-22        flexmix_2.3-14     
##  [13] bit64_0.9-7         mvtnorm_1.0-8       codetools_0.2-15   
##  [16] splines_3.5.1       R.methodsS3_1.7.1   robustbase_0.93-1.1
##  [19] Formula_1.2-3       jsonlite_1.5        ica_1.0-2          
##  [22] cluster_2.0.7-1     kernlab_0.9-26      png_0.1-7          
##  [25] R.oo_1.22.0         compiler_3.5.1      httr_1.3.1         
##  [28] backports_1.1.2     assertthat_0.2.0    lazyeval_0.2.1     
##  [31] later_0.7.3         lars_1.2            acepack_1.4.1      
##  [34] htmltools_0.3.6     tools_3.5.1         bindrcpp_0.2.2     
##  [37] igraph_1.2.1        gtable_0.2.0        glue_1.3.0         
##  [40] RANN_2.6            reshape2_1.4.3      Rcpp_1.0.0         
##  [43] trimcluster_0.1-2.1 gdata_2.18.0        ape_5.1            
##  [46] nlme_3.1-137        iterators_1.0.10    fpc_2.1-11         
##  [49] lmtest_0.9-36       stringr_1.3.1       mime_0.5           
##  [52] irlba_2.3.2         gtools_3.8.1        DEoptimR_1.0-8     
##  [55] MASS_7.3-50         zoo_1.8-3           scales_0.5.0       
##  [58] doSNOW_1.0.16       promises_1.0.1      parallel_3.5.1     
##  [61] RColorBrewer_1.1-2  yaml_2.1.19         reticulate_1.7     
##  [64] pbapply_1.3-4       rpart_4.1-13        segmented_0.5-3.0  
##  [67] latticeExtra_0.6-28 stringi_1.2.4       foreach_1.4.4      
##  [70] checkmate_1.8.5     caTools_1.17.1      SDMTools_1.1-221   
##  [73] rlang_0.2.1         pkgconfig_2.0.2     dtw_1.20-1         
##  [76] prabclus_2.2-6      bitops_1.0-6        evaluate_0.11      
##  [79] lattice_0.20-35     ROCR_1.0-7          purrr_0.2.5        
##  [82] bindr_0.1.1         htmlwidgets_1.2     bit_1.1-14         
##  [85] tidyselect_0.2.4    plyr_1.8.4          magrittr_1.5       
##  [88] R6_2.3.0            snow_0.4-3          gplots_3.0.1       
##  [91] Hmisc_4.1-1         pillar_1.3.0        foreign_0.8-70     
##  [94] withr_2.1.2         fitdistrplus_1.0-9  mixtools_1.1.0     
##  [97] survival_2.42-6     nnet_7.3-12         tibble_1.4.2       
## [100] tsne_0.1-3          crayon_1.3.4        hdf5r_1.0.0        
## [103] KernSmooth_2.23-15  rmarkdown_1.10      grid_3.5.1         
## [106] data.table_1.11.8   metap_0.9           digest_0.6.17      
## [109] diptest_0.75-7      xtable_1.8-2        tidyr_0.8.1        
## [112] httpuv_1.4.5        R.utils_2.7.0       stats4_3.5.1       
## [115] munsell_0.5.0       viridisLite_0.3.0
# Record the exact Seurat version -- this pipeline uses the Seurat 2.x API
# (@raw.data/@data slots, FilterCells, FindVariableGenes, etc.)
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Load Data

## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/") 
# Loads `seurat.obj` (Seurat v2 object with HTO demultiplexing IDs) into the workspace
load(file.path(root, "Data/seurat_object_add_HTO_ids.Rdata"))
DAT <- seurat.obj  # work on a conventionally named handle
rm(seurat.obj) # drop the duplicate reference to free memory

Pre-filtered Dimensions

DAT
## An object of class seurat in project RAJ_13357 
##  24914 genes across 22113 samples.

Clean Metadata

Add Metadata

# Per-cell metadata table (rows keyed by cell barcode; includes Age, dx, mut,
# Gender, ID columns used for plotting below)
metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
# createDT() comes from general_functions.R -- presumably a DT::datatable
# wrapper for the rendered report; confirm there
createDT( metadata[1:100,], caption = "Metadata")  
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Make AgeGroups
#' Bin the `Age` column of a metadata data.frame into fixed-width age groups.
#'
#' Adds an `AgeGroups` factor column with labels like "50-60" (left-closed
#' intervals, via cut(..., right = FALSE)) and returns the metadata with the
#' original row names preserved in a leading `rn` column, mirroring the old
#' data.table::setDT(keep.rownames = TRUE) behavior.
#'
#' @param meta data.frame with a numeric `Age` column. Defaults to the global
#'   `metadata` so the historical call `makeAgeGroups()` still works.
#' @param unit width of each age bin in years (default 10; previously
#'   hard-coded).
#' @return data.frame: `rn` column, the original columns, plus `AgeGroups`.
makeAgeGroups <- function(meta = metadata, unit = 10){
  getMaxRound <- function(vals = meta$Age) unit * ceiling(max(vals) / unit)
  getMinRound <- function(vals = meta$Age) unit * floor(min(vals) / unit)

  # One extra break beyond the rounded max so the top ages always fall in a bin
  ageBreaks <- c(seq(getMinRound(), getMaxRound(), by = unit), getMaxRound() + unit)
  # Vectorized label construction ("50-60", "60-70", ...) instead of a grown loop
  AgeGroupsUniq <- paste(head(ageBreaks, -1), tail(ageBreaks, -1), sep = "-")

  # Keep row names in an `rn` column (as setDT(keep.rownames = TRUE) did)
  out <- data.frame(rn = rownames(meta), meta, stringsAsFactors = FALSE)
  rownames(out) <- NULL
  # BUGFIX: the original passed `nclude.lowest=T` (typo), which cut() silently
  # swallowed via `...`; `include.lowest = TRUE` is now actually applied.
  out$AgeGroups <- cut(out$Age,
                       breaks = ageBreaks,
                       right = FALSE,
                       labels = AgeGroupsUniq,
                       include.lowest = TRUE)
  return(out)
}
# metadata <- makeAgeGroups()

# Attach the cleaned metadata to the Seurat object (matched by cell name)
DAT <- AddMetaData(object = DAT, metadata = metadata)  
# Get rid of any NAs (cells that don't match up with the metadata)
# BUGFIX: params$subsetCells arrives as a *character* ("F" or a cell count).
# The old `params$subsetCells==F` comparison was FALSE for "F" (since
# "F" != "FALSE"), so it fell into `0:params$subsetCells`, which (a) errored
# with "NA/NaN argument" on non-numeric input -- the failure shown in the
# original render -- and (b) used the invalid 0 index. Parse it explicitly.
nSubsetCells <- suppressWarnings(as.numeric(params$subsetCells))
if (is.na(nSubsetCells)) {
  # No subsetting requested ("F"/"FALSE"): keep every cell with >= 1 gene
  DAT <- FilterCells(object = DAT, subset.names = "nGene", low.thresholds = 0)
} else {
  # Subset for testing: keep the first n cells (1-based, capped at the total)
  DAT <- FilterCells(object = DAT, subset.names = "nGene", low.thresholds = 0,
                     cells.use = DAT@cell.names[seq_len(min(nSubsetCells,
                                                            length(DAT@cell.names)))])
}
DAT

An object of class seurat in project RAJ_13357 24914 genes across 22113 samples.

Filter & Normalize Data

Subset Genes by Biotype

Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html

#' Restrict a Seurat (v2) object to genes of a single Ensembl biotype.
#'
#' Biotype annotations are cached in Data/gene_biotypes.csv; when the cache is
#' missing they are fetched once from Ensembl GRCh37 via biomaRt and written
#' back out for future runs.
#'
#' @param DAT Seurat v2 object.
#' @param subsetGenes an Ensembl biotype string (e.g. "protein_coding"), or
#'   FALSE / "F" / "FALSE" to skip subsetting (params arrive as characters).
#' @return Seurat object rebuilt from only the requested genes, with metadata
#'   and identities carried over; or `DAT` unchanged when subsetting is off.
subsetBiotypes <- function(DAT, subsetGenes){
  # BUGFIX: params are character strings, so test the "F"/"FALSE" sentinels
  # explicitly -- the old `subsetGenes != F` would have treated the literal
  # string "F" as a biotype to subset on.
  if (!(isFALSE(subsetGenes) || subsetGenes %in% c("F", "FALSE"))) {
    cat(paste("Subsetting genes:", subsetGenes, "\n"))
    # If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
    biotypeFile <- file.path(root, "Data/gene_biotypes.csv")
    if (file_test("-f", biotypeFile)) {
      biotypes <- read.csv(biotypeFile)
    } else {
      ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", host = "grch37.ensembl.org",
                         dataset = "hsapiens_gene_ensembl")
      ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
      biotypes <- getBM(attributes = c("hgnc_symbol", "gene_biotype"),
                        filters = "hgnc_symbol",
                        values = row.names(DAT@data), mart = ensembl)
      # Cache for subsequent runs
      write.csv(biotypes, biotypeFile, quote = FALSE, row.names = FALSE)
    }
    # Keep only genes of the requested biotype that actually appear in the raw
    # matrix (guards against NA rows from symbols absent in DAT@raw.data)
    geneSubset <- biotypes[biotypes$gene_biotype == subsetGenes, "hgnc_symbol"]
    geneSubset <- intersect(geneSubset, row.names(DAT@raw.data))

    cat(paste(length(geneSubset), "/", dim(DAT@raw.data)[1],
              "genes are", subsetGenes))
    # Subsetting genes in Seurat v2 requires rebuilding the object from the
    # restricted raw matrix, then re-attaching metadata and identities.
    subset.matrix <- DAT@raw.data[geneSubset, ]
    DAT_sub <- CreateSeuratObject(subset.matrix)
    DAT_sub <- AddMetaData(object = DAT_sub, metadata = DAT@meta.data)
    DAT_sub <- SetAllIdent(object = DAT_sub, id = "ident")
    DAT <- DAT_sub
  }
  return(DAT)
}

DAT <- subsetBiotypes(DAT, params$subsetGenes)
## Subsetting genes: protein_coding 
## 14827 / 24914 genes are protein_coding

Subset Cells

Filter cells and normalize

# Report cell count before QC filtering
cat("Total Cells:", length(DAT@cell.names), "\n")
## Total Cells: 27863
# QC filter: keep cells with 200-2500 detected genes and < 5% mitochondrial
# reads (standard droplet-doublet / dying-cell cutoffs)
DAT <- FilterCells(object = DAT, subset.names = c("nGene", "percent.mito"), 
    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
cat("Filtered Cells:", length(DAT@cell.names))
## Filtered Cells: 19144
# Library-size normalize each cell to 10k counts, then log-transform
DAT <- NormalizeData(object = DAT, normalization.method = "LogNormalize", 
    scale.factor = 10000)

Subset Genes by Variance

Important!: * In ScaleData… + IMPORTANT!: Must specify do.par = F (“If set to TRUE, will use half of the machine’s available cores”) + Specify num.cores = nCores (to use all available cores, determined by parallel::detectCores())

Regress out: number of unique transcripts (nUMI), % mitochondrial transcripts (percent.mito)

# Store the top most variable genes in @var.genes
DAT <- FindVariableGenes(object = DAT, mean.function = ExpMean, dispersion.function = LogVMR,
    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

cat("Total Genes:", length(row.names(DAT@raw.data)), "\n")
## Total Genes: 14827
cat("Highly Variable Genes:", length(DAT@var.genes))
## Highly Variable Genes: 1926
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the
# 'genes.use' arg in 'ScaleData' -- this greatly reduces the computational load.

# Ensure CD14 and CD16 are included (monocyte markers inspected downstream)
appendedGenes <- c(DAT@var.genes, "CD14", "FCGR3A")
# BUGFIX: ScaleData ignores `num.cores` unless do.par = TRUE -- the original
# run (do.par = F) warned "For parallel processing, please set do.par to TRUE"
# and regressed on a single core. Enable do.par whenever >1 core is requested.
DAT <- ScaleData(object = DAT, genes.use = appendedGenes,
                 vars.to.regress = c("nUMI", "percent.mito"),
                 do.par = params$nCores > 1, num.cores = params$nCores)
## Regressing out: nUMI, percent.mito
## Scaling data matrix

Filtered Dimensions

DAT
## An object of class seurat in project SeuratProject 
##  14827 genes across 19144 samples.

Diagnostic Plots

Violin Plots

# QC violin plots per cluster.
# BUGFIX(readability): the original used `... %>% + ggplot2::aes(alpha=0.5)`,
# which magrittr evaluates as `plot + aes(alpha = 0.5)` -- written explicitly
# here. NOTE(review): mapping a *constant* alpha via aes() adds a legend entry
# rather than plain transparency; set alpha in the geom if that was intended.
vp <- VlnPlot(object = DAT, features.plot = c("nGene", "nUMI", "percent.mito"),
              nCol = 3, do.return = T) + ggplot2::aes(alpha = 0.5)
vp

Gene Plots

percent.mito plot

# results = 'hide', fig.show='hide'
# par(mfrow = c(1, 2))
# do.hover <- ifelse(interactive==T, T, F)
# Scatter of total UMIs vs % mitochondrial reads -- visual sanity check of the
# percent.mito < 0.05 cell filter applied above
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20) 

         #do.hover=do.hover, data.hover = "mut")

nGene plot

# , results = 'hide', fig.show='hide'
# do.hover <-ifelse(interactive==T, T, F)
# Scatter of total UMIs vs detected genes -- visual sanity check of the
# 200-2500 nGene cell filter applied above
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "nGene", pch.use=20)

         #do.hover=do.hover, data.hover = "mut")

Dimensionality Reduction & Clustering

PCA

ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above

  • Other Dim Reduction Methods in Seurat
  • RunCCA()
  • RunMultiCCA()
  • RunDiffusion()
  • RunPHATE()
  • RunICA()
# Run PCA with only the top most variables genes
DAT <- RunPCA(object = DAT, pc.genes = DAT@var.genes, do.print=F, verbose=F) #, pcs.print = 1:5,  genes.print = 5
# Store in Seurat object so you don't have to recalculate it for the tSNE/UMAP steps
# (ProjectPCA scores ALL genes against the computed components, not just pc.genes)
DAT <- ProjectPCA(object = DAT, do.print=F) 

VizPCA

VizPCA(object = DAT, pcs.use = 1:2)

PCHeatmaps

# 'PCHeatmap' is a wrapper for heatmap.2  
PCHeatmap(object = DAT, pc.use = 1:12,do.balanced=T, label.columns=F, use.full=F)   # cells.use = 500, 

Significant PCs

Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time

# JackStraw gives statistically significant PCs but is slow on large data;
# the elbow plot below is the quick approximation used instead
#DAT <- JackStraw(object = DAT, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = DAT)

Find Cell Clusters

  • We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.
  • The clustering approach in FindClusters was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data SNN-Cliq, Xu and Su, Bioinformatics, 2015 and CyTOF data PhenoGraph, Levine et al., Cell, 2015.
  • To further increase speed, you can employ an approximate nearest neighbor search via the RANN package by increasing the nn.eps + parameter. Setting this at 0 (the default) represents an exact neighbor search.
  • By default, we perform 100 random starts for clustering and select the result with highest modularity. You can lower this through the n.start parameter to reduce clustering time.
  • Algorithm for modularity optimization (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement; 3 = SLM algorithm).

  • On Resolution
    NA

# t-SNE on the first 10 PCs; num_threads is forwarded to Rtsne.
# (perplexity is left at the Rtsne default of 30, which matches
# params$perplexity = "30"; pass as.numeric(params$perplexity) to vary it)
DAT <- RunTSNE(object = DAT, reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
               tsne.method = "Rtsne", num_threads = params$nCores, verbose = F) #   FItSNE

# TRY DIFFERENT RESOLUTIONS
DAT <- StashIdent(object = DAT, save.name = "pre_clustering") 
# DAT <- SetAllIdent(object = DAT, id = "pre_clustering") 

# BUGFIX: params$resolution arrives as the character "0.5"; the modularity
# clustering C++ routine requires a double ("Not compatible with requested
# type: [type=character; target=double]" in the original render), so coerce.
DAT <- FindClusters(object = DAT, reduction.type = "pca", dims.use = 1:10, algorithm = 1,
                    resolution = as.numeric(params$resolution), print.output = F,
                    save.SNN = T, n.start = 10, nn.eps = 0.5, plot.SNN = T,
                    force.recalc = T) 

PrintFindClustersParams(object = DAT) 
DAT <- StashIdent(object = DAT, save.name = "post_clustering") 

PCA plot

# do.hover <-ifelse(interactive==T, T, F)
PCAPlot(object = DAT, dim.1 = 1, dim.2 = 2, group.by="post_clustering")#, do.hover=do.hover, data.hover="mut")

UMAP

Additional UMAP arguments detailed here: https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_

# cat(print(mem_used()))
# UMAP on the same 10 PCs used for clustering, so the two embeddings and the
# cluster assignments are directly comparable
DAT <- RunUMAP(object = DAT, reduction.use = "pca", dims.use = 1:10, verbose=TRUE, 
               num_threads=params$nCores, 
               genes.use = DAT@var.genes)
# cat(print(mem_used()))
# Plot results
DimPlot(object = DAT, reduction.use = 'umap')

t-SNE

As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.

Important!: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.

“FItSNE”, a new fast implementation of t-SNE, is also available through RunTSNE. However FItSNE must first be setup on your computer.

labSize <- 12 
# t-SNE colored by cluster with on-plot labels.
# BUGFIX: the original passed `aesthetics = aes(alpha=.5)` to
# scale_color_brewer(); `aesthetics` expects a character vector of aesthetic
# names (and does not exist in the ggplot2 3.0.0 in use), so the aes() object
# had no effect. Removed; set alpha in the geom if transparency is wanted.
tsnePlot <- TSNEPlot(object = DAT, do.label = T, label.size = labSize, do.return = T) + 
  scale_color_brewer(palette = "Set1")


# 
# customColors <- function(var="post_clustering", palette="Set1"){
#   add.alpha <- function(col, alpha=1){ 
#     if(missing(col))
#       stop("Please provide a vector of colours.")
#      apply(sapply(col, col2rgb)/255, 2, 
#                        function(x) 
#                          rgb(x[1], x[2], x[3], alpha=alpha))  
#   } 
#   cluster_colors  <- RColorBrewer::brewer.pal( length(unique(DAT@meta.data[var])), palette)
#   cluster_colors_transparent <- add.alpha(cluster_colors, .5) %>% as.character()
#   return(cluster_colors_transparent)
# }


# # Try t-SNE at different perplexities
# for (i in c(5,20,30,100)){
#    cat('\n')   
#    cat("### t-SNE: perplexity =",i,"\n") 
#    DAT <- RunTSNE(object=DAT,  reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,  
#                 perplexity = i, tsne.method = "Rtsne", 
#                 num_threads=params$nCores,
#                  genes.use = DAT@var.genes,
#                 verbose=F) #   FItSNE
#   tsnePlot <- TSNEPlot(object = DAT, do.label=T, label.size = labSize, do.return=T) + 
#     scale_color_brewer( palette = "Set1", aesthetics = aes(alpha=.5))
#   print(tsnePlot)
#   cat('\n')   
# } 

t-SNE & Metadata Plots

#' Side-by-side t-SNE panels: cells colored by a metadata variable (left)
#' next to the unsupervised clusters (right), printed via cowplot::plot_grid.
#'
#' @param var name of a column in DAT@meta.data to color the left panel by.
#' @param labSize on-plot label size forwarded to TSNEPlot.
#' @return value of print(); called for its plotting side effect.
tSNE_metadata_plot <- function(var, labSize = 12) { 
  # Metadata panel
  # NOTE(review): as with the t-SNE plot above, `aesthetics = aes(...)` is not
  # a valid scale_color_brewer() argument and had no effect; removed here.
  p1 <- TSNEPlot(DAT, do.return = TRUE, do.label = TRUE, group.by = var,
                 label.size = labSize, plot.title = paste(var),
                 vector.friendly = TRUE) +
    theme(legend.position = "top") + 
    scale_color_brewer(palette = "Dark2")

  # Unsupervised-clusters panel (current idents)
  p2 <- TSNEPlot(DAT, do.return = TRUE, do.label = TRUE, label.size = labSize,
                 plot.title = paste("Unsupervised Clusters"),
                 vector.friendly = TRUE) +
    theme(legend.position = "top") + 
    scale_color_brewer(palette = "Set1")

  print(plot_grid(p1, p2))
}
# Iterate plots over metadata variables, emitting a markdown header for each
# panel (side effect: printed plots + cat output for the rendered report)
metaVars <- c("dx","mut","Gender","Age", "ID")
invisible(lapply(metaVars, function(metaVar) {
  cat('\n')
  cat("### t-SNE Metadata plot for ", metaVar, "\n")
  tSNE_metadata_plot(metaVar)
  cat('\n')
}))

t-SNE Metadata plot for dx

t-SNE Metadata plot for mut

t-SNE Metadata plot for Gender

t-SNE Metadata plot for Age

t-SNE Metadata plot for ID

Save Results

# Persist the entire workspace for downstream notebooks.
# NOTE(review): bare `resultsPath` is not defined anywhere in this file --
# presumably import_parameters() promotes params$resultsPath to a global;
# confirm, or use file.path(params$resultsPath, ...) explicitly.
save.image(file.path(resultsPath, "scRNAseq_results.RData"))